{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from xpinyin import Pinyin\n",
    "import pandas as pd\n",
    "from keras.layers import LSTM, Dense\n",
    "from keras.models import Sequential\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pinyin converter shared by the name-processing helpers defined further down.\n",
    "p = Pinyin()\n",
    "# Raw dataset, read from the working directory; later cells treat it as (name, gender) rows.\n",
    "df = pd.read_csv('ChineseNames.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace the CSV's original headers with English names.\n",
    "# Assumes the file has exactly two columns, in (name, gender) order; a different column count makes this assignment fail.\n",
    "df.columns = ['Name', 'Gender']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def strip_last_name(name):\n",
    "    \"\"\"Return `name` without its first character.\n",
    "\n",
    "    Assumes the family name is exactly one character long; a longer family\n",
    "    name would leave its remaining characters in the result.\n",
    "    \"\"\"\n",
    "    return name[1:]\n",
    "\n",
    "def convert_to_pinyin(name):\n",
    "    \"\"\"Romanize `name` using the shared Pinyin instance `p`.\"\"\"\n",
    "    return p.get_pinyin(name)\n",
    "\n",
    "def process_name(name):\n",
    "    \"\"\"Drop the family name from `name`, then convert the rest to pinyin.\"\"\"\n",
    "    return convert_to_pinyin(strip_last_name(name))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     Name Gender\n",
      "9792  左婉怡      F\n",
      "9793  左烜晅      F\n",
      "9794  左雨晴      F\n",
      "9795   左越      F\n",
      "9796  左子烨      F\n",
      "           Name Gender\n",
      "9792     wan-yi      F\n",
      "9793  xuan-xuan      F\n",
      "9794    yu-qing      F\n",
      "9795        yue      F\n",
      "9796      zi-ye      F\n"
     ]
    }
   ],
   "source": [
    "# Show the same five trailing rows before and after processing.\n",
    "# NOTE: 'Name' is overwritten in place, so re-running this cell applies process_name\n",
    "# to already-processed names and drops one more leading character from each.\n",
    "print(df.tail())\n",
    "df['Name'] = df['Name'].map(process_name)\n",
    "print(df.tail())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tally how often each character occurs across all processed names.\n",
    "# The keys of this dict form the vocabulary used for the one-hot encoding.\n",
    "characters = {}\n",
    "for name in df['Name']:\n",
    "    for char in name:\n",
    "        characters[char] = characters.get(char, 0) + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Length of the longest processed name, in characters (separators included).\n",
    "# Used later as the fixed number of time steps per example.\n",
    "maxlen = df['Name'].str.len().max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map every character to the column index of its one-hot vector.\n",
    "# Indices follow sorted character order, so the encoding does not depend on the\n",
    "# order of the rows in the dataset, and no loop counter is left behind in the namespace.\n",
    "characters = {char: idx for idx, char in enumerate(sorted(characters))}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The training tensor has shape (examples, time steps, features), where each\n",
    "# feature vector is the one-hot encoding of a single character.\n",
    "num_examples = df.shape[0]\n",
    "time_steps = maxlen\n",
    "num_features = len(characters)\n",
    "\n",
    "def char_to_vec(char):\n",
    "    \"\"\"Return the one-hot encoding of `char` as an int array of shape (1, num_features).\n",
    "\n",
    "    Raises KeyError when `char` is not a key of `characters`.\n",
    "    \"\"\"\n",
    "    one_hot = np.zeros((1, num_features), dtype=int)\n",
    "    one_hot[0, characters[char]] = 1\n",
    "    return one_hot\n",
    "\n",
    "def name_to_vec(name):\n",
    "    \"\"\"Encode `name` as an int array of shape (maxlen, num_features).\n",
    "\n",
    "    Row i holds the one-hot vector of name[i]. Rows past the end of the name\n",
    "    stay all-zero (padding); characters beyond position maxlen are ignored.\n",
    "    \"\"\"\n",
    "    encoded = np.zeros((maxlen, num_features), dtype=int)\n",
    "    for position, char in enumerate(name[:maxlen]):\n",
    "        encoded[position, :] = char_to_vec(char)[0, :]\n",
    "    return encoded"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode every name and stack the results into a single float array of shape\n",
    "# (number of names, maxlen, num_features).\n",
    "X = np.stack([name_to_vec(name) for name in df['Name']]).astype(float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Encode the labels as integers: 1 for 'M', 0 for 'F'.\n",
    "# Going through a mapping means df['Gender'] is never written to and Y gets an\n",
    "# integer dtype rather than object.\n",
    "# Any label other than 'M' or 'F' maps to NaN, which makes astype(int) raise\n",
    "# instead of letting a stray string reach the model.\n",
    "Y = df['Gender'].map({'M': 1, 'F': 0}).astype(int).values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One LSTM layer reads the one-hot character sequence and returns only its final\n",
    "# output; two dense layers then feed a single output unit.\n",
    "model = Sequential()\n",
    "model.add(LSTM(64, input_shape=(maxlen, num_features), return_sequences=False, dropout=0.2, recurrent_dropout=0.2))\n",
    "model.add(Dense(64, activation='tanh'))\n",
    "model.add(Dense(64, activation='tanh'))\n",
    "# binary_crossentropy expects a probability in [0, 1], so the output unit uses\n",
    "# sigmoid; tanh ranges over [-1, 1] and is not a valid probability.\n",
    "model.add(Dense(1, activation='sigmoid'))\n",
    "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/10\n",
      "9797/9797 [==============================] - 5s 500us/step - loss: 0.7708 - acc: 0.5552\n",
      "Epoch 2/10\n",
      "9797/9797 [==============================] - 4s 357us/step - loss: 0.6784 - acc: 0.5885\n",
      "Epoch 3/10\n",
      "9797/9797 [==============================] - 3s 340us/step - loss: 0.6658 - acc: 0.6017\n",
      "Epoch 4/10\n",
      "9797/9797 [==============================] - 3s 349us/step - loss: 0.6648 - acc: 0.6043\n",
      "Epoch 5/10\n",
      "9797/9797 [==============================] - 3s 354us/step - loss: 0.6566 - acc: 0.6128\n",
      "Epoch 6/10\n",
      "9797/9797 [==============================] - 4s 364us/step - loss: 0.6574 - acc: 0.6118\n",
      "Epoch 7/10\n",
      "9797/9797 [==============================] - 4s 374us/step - loss: 0.6519 - acc: 0.6194\n",
      "Epoch 8/10\n",
      "9797/9797 [==============================] - 4s 372us/step - loss: 0.6509 - acc: 0.6164\n",
      "Epoch 9/10\n",
      "9797/9797 [==============================] - 4s 389us/step - loss: 0.6528 - acc: 0.6213\n",
      "Epoch 10/10\n",
      "9797/9797 [==============================] - 4s 403us/step - loss: 0.6451 - acc: 0.6271\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x1752cb575c0>"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Train on the whole dataset. No validation data is passed, so the accuracy\n",
    "# reported per epoch is measured on the training examples themselves.\n",
    "model.fit(X,Y, batch_size=32, epochs=10, verbose=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Real-life examples: given names of well-known people, each paired with a\n",
    "# hand-assigned label (1 = male, 0 = female).\n",
    "labelled_people = [\n",
    "    ('long', 1),       # jackie chan\n",
    "    ('xiao-long', 1),  # bruce lee\n",
    "    ('bing-bing', 0),  # fan bingbing\n",
    "    ('zi-yi', 0),      # zhang ziyi\n",
    "    ('zi-dan', 1),     # donnie yen\n",
    "    ('lian-jie', 1),   # jet li\n",
    "    ('yu-ling', 0),    # lucy liu\n",
    "    ('en-mei', 0),     # amy tan\n",
    "    ('ming-na', 0),    # ming-na wen\n",
    "    ('ze-dong', 1),    # chairman mao\n",
    "    ('jin-ping', 1),   # chairman xi\n",
    "]\n",
    "people = [name for name, label in labelled_people]\n",
    "Y_test = np.array([label for name, label in labelled_people])\n",
    "\n",
    "# One encoded example per name, stacked along a new leading axis.\n",
    "X_test = np.stack([name_to_vec(name) for name in people])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mis-classified:  bing-bing\n",
      "mis-classified:  lian-jie\n",
      "mis-classified:  jin-ping\n",
      "\n",
      "accuracy:  0.7272727272727273\n"
     ]
    }
   ],
   "source": [
    "# Threshold the model outputs at 0.5 to obtain class labels (1 = male, 0 = female).\n",
    "# Sequential.predict_classes applied the same threshold for a single-unit output,\n",
    "# but it is no longer available in newer Keras/TensorFlow releases.\n",
    "predictions = (model.predict(X_test) > 0.5).astype(int)\n",
    "predicted_labels = predictions[:, 0]\n",
    "\n",
    "# List every name whose predicted label differs from the hand-assigned one.\n",
    "for name, predicted, actual in zip(people, predicted_labels, Y_test):\n",
    "    if predicted != actual:\n",
    "        print('mis-classified: ', name)\n",
    "print()\n",
    "print('accuracy: ', np.mean(predicted_labels == Y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Result Analysis Part 1 - Famous People [72% Accuracy]\n",
    "The result is more impressive than I expected even after just 10 epochs of training. 72% accuracy is VERY HIGH for classifying Chinese names.  I would have expected Bing Bing to be classified as female, since any name ending in -ing is more commonly female than male in my experience.  I am, however, not at all surprised that Jet Li and Chairman Xi were mis-classified, since Lian and Ping are both very feminine names in my experience. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Names remembered from grade school, each paired with a hand-assigned label\n",
    "# (1 = male, 0 = female).\n",
    "labelled_people = [\n",
    "    ('ting-pei', 0),\n",
    "    ('xin', 0),\n",
    "    ('jin-hao', 1),\n",
    "    ('zhe-an', 1),\n",
    "    ('yi-cheng', 1),\n",
    "    ('zi-jun', 1),\n",
    "    ('zhi-hao', 1),\n",
    "    ('wei-han', 1),\n",
    "    ('guan-yu', 0),\n",
    "    ('xiu-qi', 1),\n",
    "    ('jun-de', 1),\n",
    "]\n",
    "people = [name for name, label in labelled_people]\n",
    "Y_test = np.array([label for name, label in labelled_people])\n",
    "\n",
    "# One encoded example per name, stacked along a new leading axis.\n",
    "X_test = np.stack([name_to_vec(name) for name in people])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mis-classified:  ting-pei\n",
      "mis-classified:  xiu-qi\n",
      "\n",
      "accuracy:  0.8181818181818182\n"
     ]
    }
   ],
   "source": [
    "# Threshold the model outputs at 0.5 to obtain class labels (1 = male, 0 = female).\n",
    "# Sequential.predict_classes applied the same threshold for a single-unit output,\n",
    "# but it is no longer available in newer Keras/TensorFlow releases.\n",
    "predictions = (model.predict(X_test) > 0.5).astype(int)\n",
    "predicted_labels = predictions[:, 0]\n",
    "\n",
    "# List every name whose predicted label differs from the hand-assigned one.\n",
    "for name, predicted, actual in zip(people, predicted_labels, Y_test):\n",
    "    if predicted != actual:\n",
    "        print('mis-classified: ', name)\n",
    "print()\n",
    "print('accuracy: ', np.mean(predicted_labels == Y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Result Analysis Part 2 - Names from Grade School [82% Accuracy]\n",
    "This time we have 82% accuracy, which is awesome!  I was surprised that ting-pei was classified as male, because ting by itself will classify as female and pei is more commonly a feminine name based on my experience. I expected both zi-jun and xiu-qi to be mis-classified right off the bat, but only xiu-qi was. This shows the model's predictive power is similar to that of people."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Result Analysis Part 3 - Names of Immediate Family Members [80% Accuracy]\n",
    "The model achieved a whopping 80% accuracy on the names of 15 of my close family members, which are withheld for privacy purposes. Three of them were mis-classified, and two of those three are known to have names that lean towards the opposite gender."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mis-classified:  ping\n",
      "mis-classified:  hua\n",
      "mis-classified:  jia-yu\n",
      "\n",
      "accuracy:  0.7\n"
     ]
    }
   ],
   "source": [
    "# A further batch of names, each paired with a hand-assigned label\n",
    "# (1 = male, 0 = female).\n",
    "labelled_people = [\n",
    "    ('xiao-qi', 0),\n",
    "    ('fu-qi', 0),\n",
    "    ('ping', 0),\n",
    "    ('hua', 0),\n",
    "    ('yong-gang', 1),\n",
    "    ('jing-zhong', 1),\n",
    "    ('zhong-yu', 1),\n",
    "    ('jia-yu', 1),\n",
    "    ('yu-ling', 0),\n",
    "    ('rei-wen', 1),\n",
    "]\n",
    "people = [name for name, label in labelled_people]\n",
    "Y_test = np.array([label for name, label in labelled_people])\n",
    "\n",
    "# One encoded example per name, stacked along a new leading axis.\n",
    "X_test = np.stack([name_to_vec(name) for name in people])\n",
    "\n",
    "# Threshold the model outputs at 0.5 to obtain class labels.\n",
    "# Sequential.predict_classes applied the same threshold for a single-unit output,\n",
    "# but it is no longer available in newer Keras/TensorFlow releases.\n",
    "predictions = (model.predict(X_test) > 0.5).astype(int)\n",
    "predicted_labels = predictions[:, 0]\n",
    "\n",
    "# List every name whose predicted label differs from the hand-assigned one.\n",
    "for name, predicted, actual in zip(people, predicted_labels, Y_test):\n",
    "    if predicted != actual:\n",
    "        print('mis-classified: ', name)\n",
    "print()\n",
    "print('accuracy: ', np.mean(predicted_labels == Y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Reference:\n",
    "Dataset: https://www.researchgate.net/publication/269630594_9800_Chinese_Names_with_Gender\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
